setwd('/Users/wendyvu/Documents/PCA_txns/')
df <- read.csv('/Users/wendyvu/Documents/PCA_txns/txn_pca_cohort_SeptOct2018_202002261339.csv',header=T)

#library(data.table)
#df <- df[df[,3] %like% "2018-10",]
dim(df)
## [1] 163752    140
# replace NA's with 0's
df[is.na(df)] <- 0
summary(df)
##        id                      user_created   
##  Min.   :     1   2018-09-01 09:26:25:     4  
##  1st Qu.: 40939   2018-09-05 18:16:52:     3  
##  Median : 81876   2018-09-06 11:16:54:     3  
##  Mean   : 81876   2018-09-09 18:01:11:     3  
##  3rd Qu.:122814   2018-09-10 10:33:50:     3  
##  Max.   :163752   2018-09-11 08:53:21:     3  
##                   (Other)            :163733  
##                   kycc           membership    
##  2018-09-09 14:09:00:     5   premium : 10620  
##  2018-09-25 06:51:45:     5   standard:153132  
##  2018-09-25 07:55:18:     5                    
##  2018-09-25 13:26:22:     5                    
##  2018-09-25 16:28:24:     5                    
##  2018-09-10 10:52:31:     4                    
##  (Other)            :163723                    
##                 product_id         market       nationality   
##  BLACK_CARD_MONTHLY  :  5924   FRA    :50361   FRA    :46582  
##  BUSINESS_BLACK      :  1160   DEU    :37913   ITA    :36359  
##  BUSINESS_CARD       : 13478   ITA    :37142   DEU    :22804  
##  FLEX_ACCOUNT_MONTHLY:  1323   RoE    :17835   ESP    :13328  
##  METAL_CARD_MONTHLY  :  3536   ESP    :15709   BRA    : 4780  
##  STANDARD            :138331   AUT    : 4710   AUT    : 3568  
##                                (Other):   82   (Other):36331  
##   nat_status                referral          closed_at     
##  expat : 35018   friend_referral:32312             :141270  
##  native:128734   organic_search :28186   2019-07-30:  1492  
##                  paid_search    :22492   2019-01-10:   870  
##                  app_store      :22086   2019-08-25:   734  
##                                 :15637   2019-07-29:   626  
##                  paid_social    :13452   2019-01-24:   496  
##                  (Other)        :29587   (Other)   : 18264  
##  kycc_closed_days    gender         age_group          age        
##  Min.   :-14.00   FEMALE: 47482   20-24  :37285   Min.   : 20.00  
##  1st Qu.:  0.00   MALE  :116270   25-29  :35499   1st Qu.: 25.00  
##  Median :  0.00                   30-34  :25225   Median : 31.00  
##  Mean   : 35.18                   35-39  :17463   Mean   : 34.37  
##  3rd Qu.:  0.00                   40-44  :12616   3rd Qu.: 41.00  
##  Max.   :542.00                   45-49  :10202   Max.   :120.00  
##                                   (Other):25462                   
##  weeks_wau_txn        mau_txn          months_sau         mau_act      
##  Min.   :-417.00   Min.   :-16.000   Min.   : 0.0000   Min.   : 0.000  
##  1st Qu.:   2.00   1st Qu.:  2.000   1st Qu.: 0.0000   1st Qu.: 1.000  
##  Median :  14.00   Median :  6.000   Median : 0.0000   Median : 4.000  
##  Mean   :  20.59   Mean   :  6.298   Mean   : 0.6625   Mean   : 4.843  
##  3rd Qu.:  38.00   3rd Qu.: 11.000   3rd Qu.: 0.0000   3rd Qu.: 9.000  
##  Max.   :  60.00   Max.   : 12.000   Max.   :15.0000   Max.   :12.000  
##                                                                        
##     n_pt_dom         n_pt_intra        n_pt_inter         n_pt_ecomm     
##  Min.   :   0.00   Min.   :   0.00   Min.   :   0.000   Min.   :   0.00  
##  1st Qu.:   0.00   1st Qu.:   0.00   1st Qu.:   0.000   1st Qu.:   0.00  
##  Median :   5.00   Median :   0.00   Median :   0.000   Median :   2.00  
##  Mean   :  48.62   Mean   :  12.86   Mean   :   5.696   Mean   :  16.01  
##  3rd Qu.:  53.00   3rd Qu.:   8.00   3rd Qu.:   0.000   3rd Qu.:  15.00  
##  Max.   :1525.00   Max.   :1588.00   Max.   :1514.000   Max.   :4216.00  
##                                                                          
##   n_pt_dom_atm     n_pt_intra_atm     n_pt_inter_atm         n_dt         
##  Min.   :  0.000   Min.   :  0.0000   Min.   :  0.000   Min.   :   0.000  
##  1st Qu.:  0.000   1st Qu.:  0.0000   1st Qu.:  0.000   1st Qu.:   0.000  
##  Median :  1.000   Median :  0.0000   Median :  0.000   Median :   0.000  
##  Mean   :  6.314   Mean   :  0.8071   Mean   :  1.089   Mean   :   5.963  
##  3rd Qu.:  5.000   3rd Qu.:  0.0000   3rd Qu.:  0.000   3rd Qu.:   4.000  
##  Max.   :367.000   Max.   :196.0000   Max.   :416.000   Max.   :1184.000  
##                                                                           
##       n_dd               n_ft            n_cash26             n_ct        
##  Min.   :   0.000   Min.   :  0.000   Min.   :  0.0000   Min.   :   0.00  
##  1st Qu.:   0.000   1st Qu.:  0.000   1st Qu.:  0.0000   1st Qu.:   1.00  
##  Median :   0.000   Median :  0.000   Median :  0.0000   Median :   6.00  
##  Mean   :   3.924   Mean   :  1.263   Mean   :  0.2392   Mean   :  14.12  
##  3rd Qu.:   0.000   3rd Qu.:  0.000   3rd Qu.:  0.0000   3rd Qu.:  18.00  
##  Max.   :3114.000   Max.   :312.000   Max.   :197.0000   Max.   :4403.00  
##                                                                           
##       n_wu            n_space_ct         n_space_dt      
##  Min.   :  0.0000   Min.   :   0.000   Min.   :   0.000  
##  1st Qu.:  0.0000   1st Qu.:   0.000   1st Qu.:   0.000  
##  Median :  0.0000   Median :   0.000   Median :   0.000  
##  Mean   :  0.2922   Mean   :   6.498   Mean   :   6.498  
##  3rd Qu.:  0.0000   3rd Qu.:   0.000   3rd Qu.:   0.000  
##  Max.   :108.0000   Max.   :1292.000   Max.   :1292.000  
##                                                          
##    pt_dom_sum         pt_intra_sum       pt_inter_sum    
##  Min.   :     0.00   Min.   :     0.0   Min.   :    0.0  
##  1st Qu.:     0.00   1st Qu.:     0.0   1st Qu.:    0.0  
##  Median :    75.16   Median :     0.0   Median :    0.0  
##  Mean   :  1142.74   Mean   :   311.7   Mean   :  191.8  
##  3rd Qu.:  1106.82   3rd Qu.:   165.5   3rd Qu.:    0.0  
##  Max.   :115460.67   Max.   :103485.7   Max.   :74898.3  
##                                                          
##   pt_ecomm_sum       pt_dom_atm_sum     pt_intra_atm_sum  
##  Min.   :     0.00   Min.   :     0.0   Min.   :    0.00  
##  1st Qu.:     0.00   1st Qu.:     0.0   1st Qu.:    0.00  
##  Median :    38.94   Median :    10.0   Median :    0.00  
##  Mean   :   806.31   Mean   :   936.5   Mean   :   99.54  
##  3rd Qu.:   666.12   3rd Qu.:   450.0   3rd Qu.:    0.00  
##  Max.   :158778.06   Max.   :169590.0   Max.   :96618.66  
##                                                           
##  pt_inter_atm_sum      dt_sum              dd_sum        
##  Min.   :    0.0   Min.   :      0.0   Min.   :     0.0  
##  1st Qu.:    0.0   1st Qu.:      0.0   1st Qu.:     0.0  
##  Median :    0.0   Median :      0.0   Median :     0.0  
##  Mean   :  163.4   Mean   :   2602.6   Mean   :   318.2  
##  3rd Qu.:    0.0   3rd Qu.:    766.1   3rd Qu.:     0.0  
##  Max.   :84347.5   Max.   :2000000.0   Max.   :179000.0  
##                                                          
##      ft_sum           cash26_sum           ct_sum         
##  Min.   :    0.00   Min.   :    0.00   Min.   :      0.0  
##  1st Qu.:    0.00   1st Qu.:    0.00   1st Qu.:      4.1  
##  Median :    0.00   Median :    0.00   Median :   1301.9  
##  Mean   :   95.78   Mean   :   58.96   Mean   :   7419.7  
##  3rd Qu.:    0.00   3rd Qu.:    0.00   3rd Qu.:   7135.0  
##  Max.   :95530.00   Max.   :91489.97   Max.   :2202467.4  
##                                                           
##      wu_sum        avg_primary_bal     avg_spaces_bal    
##  Min.   :   0.00   Min.   : -11121.2   Min.   :     0.0  
##  1st Qu.:   0.00   1st Qu.:      0.0   1st Qu.:     0.0  
##  Median :   0.00   Median :     40.2   Median :     0.0  
##  Mean   :   4.28   Mean   :    598.3   Mean   :   129.7  
##  3rd Qu.:   0.00   3rd Qu.:    295.7   3rd Qu.:     0.0  
##  Max.   :1620.00   Max.   :1428571.4   Max.   :136363.6  
##                                                          
##     n_spaces        avg_space_ct      avg_space_dt       n_ext_out     
##  Min.   : 0.0000   Min.   :      0   Min.   :      0   Min.   :   0.0  
##  1st Qu.: 0.0000   1st Qu.:      0   1st Qu.:      0   1st Qu.:   1.0  
##  Median : 0.0000   Median :      0   Median :      0   Median :  27.0  
##  Mean   : 0.4746   Mean   :   1201   Mean   :   1201   Mean   : 102.7  
##  3rd Qu.: 1.0000   3rd Qu.:      0   3rd Qu.:      0   3rd Qu.: 134.0  
##  Max.   :37.0000   Max.   :8900000   Max.   :8900000   Max.   :4250.0  
##                                                                        
##     n_ext_in        ext_out_sum          ext_in_sum      grocery_market   
##  Min.   :   0.00   Min.   :      0.0   Min.   :      0   Min.   :   0.00  
##  1st Qu.:   1.00   1st Qu.:      9.6   1st Qu.:     10   1st Qu.:   0.00  
##  Median :   7.00   Median :   1184.8   Median :   1386   Median :   1.00  
##  Mean   :  15.61   Mean   :   6702.6   Mean   :   7564   Mean   :  16.33  
##  3rd Qu.:  20.00   3rd Qu.:   6603.2   3rd Qu.:   7364   3rd Qu.:  14.00  
##  Max.   :4407.00   Max.   :2000000.0   Max.   :2203650   Max.   :1099.00  
##                                                                           
##    restaurant           atm            fast_food       local_transport  
##  Min.   :  0.000   Min.   :  0.000   Min.   :  0.000   Min.   :  0.000  
##  1st Qu.:  0.000   1st Qu.:  0.000   1st Qu.:  0.000   1st Qu.:  0.000  
##  Median :  0.000   Median :  1.000   Median :  0.000   Median :  0.000  
##  Mean   :  9.094   Mean   :  8.204   Mean   :  5.715   Mean   :  4.083  
##  3rd Qu.:  7.000   3rd Qu.:  9.000   3rd Qu.:  5.000   3rd Qu.:  2.000  
##  Max.   :690.000   Max.   :417.000   Max.   :525.000   Max.   :419.000  
##                                                                         
##     clothing           retail           household        gas_service     
##  Min.   :  0.000   Min.   :   0.000   Min.   :  0.000   Min.   :  0.000  
##  1st Qu.:  0.000   1st Qu.:   0.000   1st Qu.:  0.000   1st Qu.:  0.000  
##  Median :  0.000   Median :   0.000   Median :  0.000   Median :  0.000  
##  Mean   :  3.668   Mean   :   3.687   Mean   :  3.635   Mean   :  3.171  
##  3rd Qu.:  4.000   3rd Qu.:   4.000   3rd Qu.:  2.000   3rd Qu.:  1.000  
##  Max.   :833.000   Max.   :1440.000   Max.   :365.000   Max.   :552.000  
##                                                                          
##   food_drinks         taxicabs        drug_pharma        bars_clubs     
##  Min.   :  0.000   Min.   :  0.000   Min.   :  0.000   Min.   :  0.000  
##  1st Qu.:  0.000   1st Qu.:  0.000   1st Qu.:  0.000   1st Qu.:  0.000  
##  Median :  0.000   Median :  0.000   Median :  0.000   Median :  0.000  
##  Mean   :  2.578   Mean   :  2.242   Mean   :  2.007   Mean   :  1.979  
##  3rd Qu.:  1.000   3rd Qu.:  0.000   3rd Qu.:  1.000   3rd Qu.:  1.000  
##  Max.   :525.000   Max.   :688.000   Max.   :274.000   Max.   :318.000  
##                                                                         
##  car_toll_parking  entertainment       utilities       subscriptions     
##  Min.   :  0.000   Min.   :  0.000   Min.   :  0.000   Min.   :   0.000  
##  1st Qu.:  0.000   1st Qu.:  0.000   1st Qu.:  0.000   1st Qu.:   0.000  
##  Median :  0.000   Median :  0.000   Median :  0.000   Median :   0.000  
##  Mean   :  1.762   Mean   :  1.603   Mean   :  1.506   Mean   :   1.347  
##  3rd Qu.:  0.000   3rd Qu.:  1.000   3rd Qu.:  1.000   3rd Qu.:   0.000  
##  Max.   :568.000   Max.   :548.000   Max.   :372.000   Max.   :2431.000  
##                                                                          
##    bookstores      business_org_serv  hotel_lodge     computer_electronic
##  Min.   :  0.000   Min.   :  0.000   Min.   :  0.00   Min.   :  0.000    
##  1st Qu.:  0.000   1st Qu.:  0.000   1st Qu.:  0.00   1st Qu.:  0.000    
##  Median :  0.000   Median :  0.000   Median :  0.00   Median :  0.000    
##  Mean   :  1.291   Mean   :  1.414   Mean   :  1.27   Mean   :  1.228    
##  3rd Qu.:  0.000   3rd Qu.:  1.000   3rd Qu.:  1.00   3rd Qu.:  1.000    
##  Max.   :851.000   Max.   :731.000   Max.   :130.00   Max.   :409.000    
##                                                                          
##     bakeries       gambling_gaming    record_stores     
##  Min.   :  0.000   Min.   :   0.000   Min.   :  0.0000  
##  1st Qu.:  0.000   1st Qu.:   0.000   1st Qu.:  0.0000  
##  Median :  0.000   Median :   0.000   Median :  0.0000  
##  Mean   :  1.233   Mean   :   1.137   Mean   :  0.9117  
##  3rd Qu.:  0.000   3rd Qu.:   0.000   3rd Qu.:  0.0000  
##  Max.   :265.000   Max.   :1186.000   Max.   :350.0000  
##                                                         
##  digital_goods         airline         beauty_stores     
##  Min.   :  0.0000   Min.   :  0.0000   Min.   :  0.0000  
##  1st Qu.:  0.0000   1st Qu.:  0.0000   1st Qu.:  0.0000  
##  Median :  0.0000   Median :  0.0000   Median :  0.0000  
##  Mean   :  0.8821   Mean   :  0.7223   Mean   :  0.6941  
##  3rd Qu.:  0.0000   3rd Qu.:  0.0000   3rd Qu.:  0.0000  
##  Max.   :405.0000   Max.   :241.0000   Max.   :119.0000  
##                                                          
##  transport_serv     fines_tax_gov      money_financial   
##  Min.   :  0.0000   Min.   :  0.0000   Min.   :  0.0000  
##  1st Qu.:  0.0000   1st Qu.:  0.0000   1st Qu.:  0.0000  
##  Median :  0.0000   Median :  0.0000   Median :  0.0000  
##  Mean   :  0.6354   Mean   :  0.6428   Mean   :  0.5297  
##  3rd Qu.:  0.0000   3rd Qu.:  0.0000   3rd Qu.:  0.0000  
##  Max.   :229.0000   Max.   :363.0000   Max.   :288.0000  
##                                                          
##  professional_serv  discount_stores    travel_agencies   
##  Min.   :  0.0000   Min.   :  0.0000   Min.   :  0.0000  
##  1st Qu.:  0.0000   1st Qu.:  0.0000   1st Qu.:  0.0000  
##  Median :  0.0000   Median :  0.0000   Median :  0.0000  
##  Mean   :  0.5188   Mean   :  0.5192   Mean   :  0.4319  
##  3rd Qu.:  0.0000   3rd Qu.:  0.0000   3rd Qu.:  0.0000  
##  Max.   :843.0000   Max.   :831.0000   Max.   :157.0000  
##                                                          
##  computer_data_serv   car_rental        health_serv     
##  Min.   :  0.0000   Min.   :  0.0000   Min.   : 0.0000  
##  1st Qu.:  0.0000   1st Qu.:  0.0000   1st Qu.: 0.0000  
##  Median :  0.0000   Median :  0.0000   Median : 0.0000  
##  Mean   :  0.4067   Mean   :  0.3669   Mean   : 0.2615  
##  3rd Qu.:  0.0000   3rd Qu.:  0.0000   3rd Qu.: 0.0000  
##  Max.   :921.0000   Max.   :431.0000   Max.   :74.0000  
##                                                         
##  advertising_serv     education         dating_serv       
##  Min.   :  0.0000   Min.   :  0.0000   Min.   :  0.00000  
##  1st Qu.:  0.0000   1st Qu.:  0.0000   1st Qu.:  0.00000  
##  Median :  0.0000   Median :  0.0000   Median :  0.00000  
##  Mean   :  0.2417   Mean   :  0.1612   Mean   :  0.04859  
##  3rd Qu.:  0.0000   3rd Qu.:  0.0000   3rd Qu.:  0.00000  
##  Max.   :341.0000   Max.   :251.0000   Max.   :109.00000  
##                                                           
##      no_cat         grocery_market_sum  restaurant_sum       atm_sum      
##  Min.   :   0.000   Min.   :    0.000   Min.   :    0.0   Min.   :     0  
##  1st Qu.:   0.000   1st Qu.:    0.000   1st Qu.:    0.0   1st Qu.:     0  
##  Median :   0.000   Median :    2.025   Median :    0.0   Median :    30  
##  Mean   :   3.242   Mean   :  266.700   Mean   :  192.0   Mean   :  1198  
##  3rd Qu.:   2.000   3rd Qu.:  212.040   3rd Qu.:  146.9   3rd Qu.:   839  
##  Max.   :1750.000   Max.   :30770.920   Max.   :27987.1   Max.   :169590  
##                                                                           
##  fast_food_sum      local_transport_sum  clothing_sum     
##  Min.   :    0.00   Min.   :    0.00    Min.   :     0.0  
##  1st Qu.:    0.00   1st Qu.:    0.00    1st Qu.:     0.0  
##  Median :    0.00   Median :    0.00    Median :     0.0  
##  Mean   :   63.55   Mean   :   72.98    Mean   :   193.4  
##  3rd Qu.:   51.86   3rd Qu.:   28.39    3rd Qu.:   139.9  
##  Max.   :10683.87   Max.   :34657.77    Max.   :108255.7  
##                                                           
##    retail_sum       household_sum      gas_service_sum   
##  Min.   :    0.00   Min.   :    0.00   Min.   :    0.00  
##  1st Qu.:    0.00   1st Qu.:    0.00   1st Qu.:    0.00  
##  Median :    0.00   Median :    0.00   Median :    0.00  
##  Mean   :  123.35   Mean   :  158.30   Mean   :   94.09  
##  3rd Qu.:   86.42   3rd Qu.:   61.45   3rd Qu.:   24.91  
##  Max.   :53229.67   Max.   :46243.17   Max.   :22330.86  
##                                                          
##  food_drinks_sum     taxicabs_sum     drug_pharma_sum   bars_clubs_sum    
##  Min.   :    0.00   Min.   :   0.00   Min.   :   0.00   Min.   :    0.00  
##  1st Qu.:    0.00   1st Qu.:   0.00   1st Qu.:   0.00   1st Qu.:    0.00  
##  Median :    0.00   Median :   0.00   Median :   0.00   Median :    0.00  
##  Mean   :   35.09   Mean   :  27.46   Mean   :  32.03   Mean   :   36.92  
##  3rd Qu.:   12.37   3rd Qu.:   0.00   3rd Qu.:  13.34   3rd Qu.:    5.99  
##  Max.   :13616.95   Max.   :9402.01   Max.   :8536.29   Max.   :41767.00  
##                                                                           
##  car_toll_parking_sum entertainment_sum utilities_sum     
##  Min.   :    0.0      Min.   :    0.0   Min.   :    0.00  
##  1st Qu.:    0.0      1st Qu.:    0.0   1st Qu.:    0.00  
##  Median :    0.0      Median :    0.0   Median :    0.00  
##  Mean   :   12.3      Mean   :   50.9   Mean   :   41.35  
##  3rd Qu.:    0.0      3rd Qu.:   22.5   3rd Qu.:    5.00  
##  Max.   :10896.0      Max.   :57889.7   Max.   :20867.86  
##                                                           
##  subscriptions_sum  bookstores_sum     business_org_serv_sum
##  Min.   :    0.00   Min.   :    0.00   Min.   :    0.00     
##  1st Qu.:    0.00   1st Qu.:    0.00   1st Qu.:    0.00     
##  Median :    0.00   Median :    0.00   Median :    0.00     
##  Mean   :   49.09   Mean   :   37.74   Mean   :   57.52     
##  3rd Qu.:    0.00   3rd Qu.:    0.00   3rd Qu.:    6.61     
##  Max.   :55001.20   Max.   :15765.70   Max.   :36623.60     
##                                                             
##  hotel_lodge_sum    computer_electronic_sum  bakeries_sum     
##  Min.   :    0.00   Min.   :    0.00        Min.   :   0.000  
##  1st Qu.:    0.00   1st Qu.:    0.00        1st Qu.:   0.000  
##  Median :    0.00   Median :    0.00        Median :   0.000  
##  Mean   :  129.95   Mean   :   84.72        Mean   :   9.151  
##  3rd Qu.:   13.39   3rd Qu.:   12.28        3rd Qu.:   0.000  
##  Max.   :54631.60   Max.   :52656.00        Max.   :2835.200  
##                                                               
##  gambling_gaming_sum record_stores_sum   digital_goods_sum 
##  Min.   :     0.00   Min.   :    0.000   Min.   :    0.00  
##  1st Qu.:     0.00   1st Qu.:    0.000   1st Qu.:    0.00  
##  Median :     0.00   Median :    0.000   Median :    0.00  
##  Mean   :    60.87   Mean   :    8.281   Mean   :   12.76  
##  3rd Qu.:     0.00   3rd Qu.:    0.000   3rd Qu.:    0.00  
##  Max.   :110381.00   Max.   :22874.270   Max.   :22165.19  
##                                                            
##   airline_sum       beauty_stores_sum  transport_serv_sum 
##  Min.   :    0.00   Min.   :    0.00   Min.   :    0.000  
##  1st Qu.:    0.00   1st Qu.:    0.00   1st Qu.:    0.000  
##  Median :    0.00   Median :    0.00   Median :    0.000  
##  Mean   :   98.08   Mean   :   25.09   Mean   :    7.888  
##  3rd Qu.:    0.00   3rd Qu.:    0.00   3rd Qu.:    0.000  
##  Max.   :47573.15   Max.   :28088.09   Max.   :16476.000  
##                                                           
##  fines_tax_gov_sum  money_financial_sum professional_serv_sum
##  Min.   :    0.00   Min.   :    0.00    Min.   :    0.00     
##  1st Qu.:    0.00   1st Qu.:    0.00    1st Qu.:    0.00     
##  Median :    0.00   Median :    0.00    Median :    0.00     
##  Mean   :   22.08   Mean   :   90.73    Mean   :   19.98     
##  3rd Qu.:    0.00   3rd Qu.:    0.00    3rd Qu.:    0.00     
##  Max.   :70762.28   Max.   :69678.22    Max.   :30000.00     
##                                                              
##  discount_stores_sum travel_agencies_sum computer_data_serv_sum
##  Min.   :    0.00    Min.   :    0.0     Min.   :    0.00      
##  1st Qu.:    0.00    1st Qu.:    0.0     1st Qu.:    0.00      
##  Median :    0.00    Median :    0.0     Median :    0.00      
##  Mean   :   11.87    Mean   :   61.1     Mean   :   13.79      
##  3rd Qu.:    0.00    3rd Qu.:    0.0     3rd Qu.:    0.00      
##  Max.   :44565.12    Max.   :35095.8     Max.   :28945.56      
##                                                                
##  car_rental_sum     health_serv_sum    advertising_serv_sum
##  Min.   :    0.00   Min.   :    0.00   Min.   :     0.00   
##  1st Qu.:    0.00   1st Qu.:    0.00   1st Qu.:     0.00   
##  Median :    0.00   Median :    0.00   Median :     0.00   
##  Mean   :   25.05   Mean   :   20.98   Mean   :    17.11   
##  3rd Qu.:    0.00   3rd Qu.:    0.00   3rd Qu.:     0.00   
##  Max.   :34410.49   Max.   :18238.78   Max.   :151631.37   
##                                                            
##  education_sum      dating_serv_sum      no_cat_sum           cohort     
##  Min.   :    0.00   Min.   :   0.000   Min.   :    0.00   2018-09:82932  
##  1st Qu.:    0.00   1st Qu.:   0.000   1st Qu.:    0.00   2018-10:80820  
##  Median :    0.00   Median :   0.000   Median :    0.00                  
##  Mean   :   17.88   Mean   :   1.141   Mean   :  171.23                  
##  3rd Qu.:    0.00   3rd Qu.:   0.000   3rd Qu.:   70.99                  
##  Max.   :32999.97   Max.   :5812.690   Max.   :51036.75                  
##                                                                          
##          continent      cntry_nationality     round           
##  AFRICA       :  3417   FRANCE :46582     Min.   :-1625597.0  
##  ASIA         :  4726   ITALY  :36359     1st Qu.:    -180.4  
##  EUROPE       :146205   GERMANY:22804     Median :       5.0  
##  NORTH AMERICA:  2426   SPAIN  :13328     Mean   :    2436.6  
##  OCEANIA      :   473   BRAZIL : 4780     3rd Qu.:    2010.0  
##  SOUTH AMERICA:  6505   AUSTRIA: 3568     Max.   :  504132.0  
##                         (Other):36331                         
##     pnl_cat      
##  neg_pnl: 57398  
##  pos_pnl:106354  
##                  
##                  
##                  
##                  
## 
# REMOVE users that did not make an external deposit
# (77% made at least 1 ext deposit)
df1 <- df[df$mau_act > 0,]
dim(df1)
## [1] 125356    140
mau0 <- df[df$mau_act == 0,]

# split Business accounts 
bus <- df1[df1$product_id %in% c('BUSINESS_CARD','BUSINESS_BLACK'),]
dim(bus)
## [1] 11502   140
#split SAUs (users with >= 1 week SAU flag) 
# 11% of users have at least 1 week of SAU flag
summary(df1$months_sau)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.8654  0.0000 15.0000
sau <- df1[df1$months_sau > 0,]
dim(sau)
## [1] 18031   140
# TYPICAL N26 USERS
# NON-business accounts N= 97031
#df1 <- df1[(df1$months_sau == 0),] 
df1 <- df1[!df1$product_id %in% c('BUSINESS_CARD','BUSINESS_BLACK'),]

dim(df1)
## [1] 113854    140
head(df1)
##   id        user_created                kycc membership product_id market
## 1  1 2015-05-25 18:23:10 2018-09-03 15:29:47   standard   STANDARD    DEU
## 2  2 2015-07-20 08:56:19 2018-10-26 17:03:30   standard   STANDARD    AUT
## 3  3 2015-07-29 11:20:35 2018-10-24 13:13:25   standard   STANDARD       
## 4  4 2015-08-26 14:07:42 2018-10-12 11:14:12   standard   STANDARD    AUT
## 5  5 2015-08-29 13:43:24 2018-09-29 14:28:19   standard   STANDARD       
## 7  7 2015-09-24 18:02:53 2018-09-17 14:45:10   standard   STANDARD       
##   nationality nat_status        referral  closed_at kycc_closed_days
## 1         DEU     native                                           0
## 2         AUT     native                                           0
## 3         AUT     native                                           0
## 4         AUT     native                 2019-11-28              412
## 5         AUT     native friend_referral                           0
## 7         AUT     native                                           0
##   gender age_group age weeks_wau_txn mau_txn months_sau mau_act n_pt_dom
## 1   MALE     25-29  30            40      12          0      12        8
## 2   MALE     25-29  27             3       2          0       1        1
## 3   MALE     45-49  48            13       8          0       4        0
## 4   MALE     30-34  32             4       3          0       2        0
## 5   MALE     30-34  35            35      11          0       7       39
## 7   MALE     25-29  27            51      12          0       6      120
##   n_pt_intra n_pt_inter n_pt_ecomm n_pt_dom_atm n_pt_intra_atm
## 1        105          0         53            2              2
## 2          0          0          0            0              0
## 3          4          3          7            0              1
## 4          0          0          0            3              0
## 5         23          0         13            6              1
## 7          5          0         11            0              0
##   n_pt_inter_atm n_dt n_dd n_ft n_cash26 n_ct n_wu n_space_ct n_space_dt
## 1              0    0    0    1        0   23    0          5          5
## 2              0    0    0    1        0    2    0          0          0
## 3              0    0    0    0        3    2    0          0          0
## 4              0    0    0    0        0    3    0          0          0
## 5              0    0    0    0        0   10    0          0          0
## 7              0    1    0    0        0   11    0          0          0
##   pt_dom_sum pt_intra_sum pt_inter_sum pt_ecomm_sum pt_dom_atm_sum
## 1     124.73      1143.68         0.00      1693.19             20
## 2      51.60         0.00         0.00         0.00              0
## 3       0.00        42.79       338.02      1045.59              0
## 4       0.00         0.00         0.00         0.00             30
## 5    2420.15       319.73         0.00       539.61           1260
## 7     659.91        35.95         0.00        62.09              0
##   pt_intra_atm_sum pt_inter_atm_sum dt_sum dd_sum ft_sum cash26_sum
## 1            70.00                0      0      0   3.25          0
## 2             0.00                0      0      0  31.00          0
## 3           174.43                0      0      0   0.00       1000
## 4             0.00                0      0      0   0.00          0
## 5           120.00                0      0      0   0.00          0
## 7             0.00                0     10      0   0.00          0
##    ct_sum wu_sum avg_primary_bal avg_spaces_bal n_spaces avg_space_ct
## 1 2815.76      0         -86.437              0        2     28.36879
## 2  101.00      0          15.771              0        0      0.00000
## 3  850.00      0         435.727              0        0      0.00000
## 4   71.90      0         -40.150              0        2      0.00000
## 5 4826.00      0         312.121              0        0      0.00000
## 7  810.00      0          60.531              0        0      0.00000
##   avg_space_dt n_ext_out n_ext_in ext_out_sum ext_in_sum grocery_market
## 1     28.36879       171       24     3054.85    2817.26             14
## 2      0.00000         2        2       82.60     101.00              0
## 3      0.00000        15        5     1600.83    1850.00              0
## 4      0.00000         3        3       30.00      71.90              0
## 5      0.00000        82       12     4659.49    4871.05             10
## 7      0.00000       138       17      771.14     852.71             90
##   restaurant atm fast_food local_transport clothing retail household
## 1         20   4         9               4        0      4         2
## 2          1   0         0               0        0      0         0
## 3          4   1         0               0        1      0         0
## 4          0   3         0               0        0      0         0
## 5          1   7         0               0        4      2         0
## 7          4   0        12              10        3      0         0
##   gas_service food_drinks taxicabs drug_pharma bars_clubs car_toll_parking
## 1           1           9       10           2         36                0
## 2           0           0        0           0          0                0
## 3           0           0        1           0          0                0
## 4           0           0        0           0          0                0
## 5           1           0       21           2          0                0
## 7           0           0        0           0          0                0
##   entertainment utilities subscriptions bookstores business_org_serv
## 1             0         0             3          0                 0
## 2             0         0             0          0                 0
## 3             0         0             0          0                 1
## 4             0         0             0          0                 0
## 5             0         0             1          0                 0
## 7             0         0            11          1                 0
##   hotel_lodge computer_electronic bakeries gambling_gaming record_stores
## 1           1                   1        0               0             0
## 2           0                   0        0               0             0
## 3           0                   0        0               0             0
## 4           0                   0        0               0             0
## 5           1                  21        1               0             0
## 7           0                   2        1               0             0
##   digital_goods airline beauty_stores transport_serv fines_tax_gov
## 1             4       1             0              4             1
## 2             0       0             0              0             0
## 3             0       3             0              0             0
## 4             0       0             0              0             0
## 5             6       0             0              0             0
## 7             0       0             0              0             0
##   money_financial professional_serv discount_stores travel_agencies
## 1              35                 4               0               1
## 2               0                 0               0               0
## 3               0                 0               0               3
## 4               0                 0               0               0
## 5               0                 0               2               0
## 7               0                 0               0               0
##   computer_data_serv car_rental health_serv advertising_serv education
## 1                  0          0           0                0         0
## 2                  0          0           0                0         0
## 3                  0          0           0                0         0
## 4                  0          0           0                0         0
## 5                  0          0           0                0         0
## 7                  0          0           0                0         0
##   dating_serv no_cat grocery_market_sum restaurant_sum atm_sum
## 1           0      0             178.34         298.70   90.00
## 2           0      0               0.00          51.60    0.00
## 3           0      1               0.00          42.79  174.43
## 4           0      0               0.00           0.00   30.00
## 5           0      2             418.86          15.00 1380.00
## 7           0      2             445.88          62.40    0.00
##   fast_food_sum local_transport_sum clothing_sum retail_sum household_sum
## 1         60.20               45.85         0.00      18.75          21.5
## 2          0.00                0.00         0.00       0.00           0.0
## 3          0.00                0.00       139.00       0.00           0.0
## 4          0.00                0.00         0.00       0.00           0.0
## 5          0.00                0.00       257.51     172.86           0.0
## 7         71.57               59.40         4.02       0.00           0.0
##   gas_service_sum food_drinks_sum taxicabs_sum drug_pharma_sum
## 1             3.8            35.5       110.60           12.10
## 2             0.0             0.0         0.00            0.00
## 3             0.0             0.0        15.79            0.00
## 4             0.0             0.0         0.00            0.00
## 5            13.9             0.0       188.80          119.24
## 7             0.0             0.0         0.00            0.00
##   bars_clubs_sum car_toll_parking_sum entertainment_sum utilities_sum
## 1          394.1                    0                 0             0
## 2            0.0                    0                 0             0
## 3            0.0                    0                 0             0
## 4            0.0                    0                 0             0
## 5            0.0                    0                 0             0
## 7            0.0                    0                 0             0
##   subscriptions_sum bookstores_sum business_org_serv_sum hotel_lodge_sum
## 1              2.80            0.0                   0.0            11.8
## 2              0.00            0.0                   0.0             0.0
## 3              0.00            0.0                  70.5             0.0
## 4              0.00            0.0                   0.0             0.0
## 5             67.40            0.0                   0.0            25.8
## 7             58.68            3.1                   0.0             0.0
##   computer_electronic_sum bakeries_sum gambling_gaming_sum
## 1                    9.99         0.00                   0
## 2                    0.00         0.00                   0
## 3                    0.00         0.00                   0
## 4                    0.00         0.00                   0
## 5                 1574.55         2.95                   0
## 7                   35.34         2.20                   0
##   record_stores_sum digital_goods_sum airline_sum beauty_stores_sum
## 1                 0             38.95       30.00                 0
## 2                 0              0.00        0.00                 0
## 3                 0              0.00      716.03                 0
## 4                 0              0.00        0.00                 0
## 5                 0            115.93        0.00                 0
## 7                 0              0.00        0.00                 0
##   transport_serv_sum fines_tax_gov_sum money_financial_sum
## 1               32.1                15              1465.3
## 2                0.0                 0                 0.0
## 3                0.0                 0                 0.0
## 4                0.0                 0                 0.0
## 5                0.0                 0                 0.0
## 7                0.0                 0                 0.0
##   professional_serv_sum discount_stores_sum travel_agencies_sum
## 1                126.22                0.00               50.00
## 2                  0.00                0.00                0.00
## 3                  0.00                0.00              258.35
## 4                  0.00                0.00                0.00
## 5                  0.00              172.79                0.00
## 7                  0.00                0.00                0.00
##   computer_data_serv_sum car_rental_sum health_serv_sum
## 1                      0              0               0
## 2                      0              0               0
## 3                      0              0               0
## 4                      0              0               0
## 5                      0              0               0
## 7                      0              0               0
##   advertising_serv_sum education_sum dating_serv_sum no_cat_sum  cohort
## 1                    0             0               0       0.00 2018-09
## 2                    0             0               0       0.00 2018-10
## 3                    0             0               0     183.94 2018-10
## 4                    0             0               0       0.00 2018-10
## 5                    0             0               0     133.90 2018-09
## 7                    0             0               0      15.36 2018-09
##   continent cntry_nationality    round pnl_cat
## 1    EUROPE           GERMANY  1694.01 pos_pnl
## 2    EUROPE           AUSTRIA    23.00 pos_pnl
## 3    EUROPE           AUSTRIA  2185.00 pos_pnl
## 4    EUROPE           AUSTRIA -2541.52 neg_pnl
## 5    EUROPE           AUSTRIA   361.00 pos_pnl
## 7    EUROPE           AUSTRIA   157.00 pos_pnl
df_fin <- df1[,14:135]

# FREQUENCY OF USERS FOR EACH FEATURE
library("ggplot2")
d = NULL
for (i in 1:dim(df_fin)[2]){
  feat = df_fin[,1:122]
  sum = sum((feat[,i]) & (!is.na(feat[,i])) != 0)
  total = dim(feat)[1]
  percent = sum/total
  feature = colnames(feat)[i]
  print(percent)
  d = rbind(d, data.frame(percent,feature))
}
## [1] 1
## [1] 0.9994203
## [1] 0.9991656
## [1] 0.1477594
## [1] 1
## [1] 0.7868147
## [1] 0.5802256
## [1] 0.2499517
## [1] 0.7516117
## [1] 0.6709031
## [1] 0.2399652
## [1] 0.1543731
## [1] 0.5966325
## [1] 0.2154777
## [1] 0.2018287
## [1] 0.05893513
## [1] 0.9975495
## [1] 0.230172
## [1] 0.2612996
## [1] 0.2613523
## [1] 0.7868147
## [1] 0.5802256
## [1] 0.2499517
## [1] 0.7516117
## [1] 0.6709031
## [1] 0.2399652
## [1] 0.1543731
## [1] 0.5966325
## [1] 0.2154777
## [1] 0.2018287
## [1] 0.05893513
## [1] 0.9975495
## [1] 0.230172
## [1] 0.9753281
## [1] 0.2124124
## [1] 0.3232034
## [1] 0.2612996
## [1] 0.2613523
## [1] 0.9602825
## [1] 1
## [1] 0.9602825
## [1] 1
## [1] 0.6697261
## [1] 0.629587
## [1] 0.7297592
## [1] 0.5890702
## [1] 0.4573401
## [1] 0.5839584
## [1] 0.6000492
## [1] 0.5276494
## [1] 0.4189927
## [1] 0.446124
## [1] 0.2404659
## [1] 0.4258612
## [1] 0.3539182
## [1] 0.2756249
## [1] 0.4293569
## [1] 0.3394259
## [1] 0.2762397
## [1] 0.319005
## [1] 0.363606
## [1] 0.3555167
## [1] 0.3753667
## [1] 0.2926028
## [1] 0.09953976
## [1] 0.1327226
## [1] 0.1921935
## [1] 0.2465702
## [1] 0.2774782
## [1] 0.1689357
## [1] 0.2429164
## [1] 0.1418483
## [1] 0.2108929
## [1] 0.156727
## [1] 0.1928874
## [1] 0.1437631
## [1] 0.09736153
## [1] 0.1535036
## [1] 0.07366452
## [1] 0.08308009
## [1] 0.01901558
## [1] 0.5390588
## [1] 0.6697261
## [1] 0.629587
## [1] 0.7297592
## [1] 0.5890702
## [1] 0.4573401
## [1] 0.5839584
## [1] 0.6000492
## [1] 0.5276494
## [1] 0.4189927
## [1] 0.446124
## [1] 0.2404659
## [1] 0.4258612
## [1] 0.3539182
## [1] 0.2756249
## [1] 0.4293569
## [1] 0.3394259
## [1] 0.2762397
## [1] 0.319005
## [1] 0.363606
## [1] 0.3555167
## [1] 0.3753667
## [1] 0.2926028
## [1] 0.09953976
## [1] 0.1327226
## [1] 0.1921935
## [1] 0.2465702
## [1] 0.2774782
## [1] 0.1689357
## [1] 0.2429164
## [1] 0.1418483
## [1] 0.2108929
## [1] 0.156727
## [1] 0.1928874
## [1] 0.1437631
## [1] 0.09736153
## [1] 0.1535036
## [1] 0.07366452
## [1] 0.08308009
## [1] 0.01901558
## [1] 0.5390588
d.sort <- d[order(d$percent,decreasing=TRUE),]
print(d.sort)
##        percent                 feature
## 1   1.00000000                     age
## 5   1.00000000                 mau_act
## 40  1.00000000                n_ext_in
## 42  1.00000000              ext_in_sum
## 2   0.99942031           weeks_wau_txn
## 3   0.99916560                 mau_txn
## 17  0.99754949                    n_ct
## 32  0.99754949                  ct_sum
## 34  0.97532805         avg_primary_bal
## 39  0.96028247               n_ext_out
## 41  0.96028247             ext_out_sum
## 6   0.78681469                n_pt_dom
## 21  0.78681469              pt_dom_sum
## 9   0.75161171              n_pt_ecomm
## 24  0.75161171            pt_ecomm_sum
## 45  0.72975917                     atm
## 85  0.72975917                 atm_sum
## 10  0.67090309            n_pt_dom_atm
## 25  0.67090309          pt_dom_atm_sum
## 43  0.66972614          grocery_market
## 83  0.66972614      grocery_market_sum
## 44  0.62958701              restaurant
## 84  0.62958701          restaurant_sum
## 49  0.60004919                  retail
## 89  0.60004919              retail_sum
## 13  0.59663253                    n_dt
## 28  0.59663253                  dt_sum
## 46  0.58907021               fast_food
## 86  0.58907021           fast_food_sum
## 48  0.58395840                clothing
## 88  0.58395840            clothing_sum
## 7   0.58022555              n_pt_intra
## 22  0.58022555            pt_intra_sum
## 82  0.53905879                  no_cat
## 122 0.53905879              no_cat_sum
## 50  0.52764945               household
## 90  0.52764945           household_sum
## 47  0.45734010         local_transport
## 87  0.45734010     local_transport_sum
## 52  0.44612398             food_drinks
## 92  0.44612398         food_drinks_sum
## 57  0.42935690           entertainment
## 97  0.42935690       entertainment_sum
## 54  0.42586119             drug_pharma
## 94  0.42586119         drug_pharma_sum
## 51  0.41899275             gas_service
## 91  0.41899275         gas_service_sum
## 63  0.37536670     computer_electronic
## 103 0.37536670 computer_electronic_sum
## 61  0.36360602       business_org_serv
## 101 0.36360602   business_org_serv_sum
## 62  0.35551671             hotel_lodge
## 102 0.35551671         hotel_lodge_sum
## 55  0.35391818              bars_clubs
## 95  0.35391818          bars_clubs_sum
## 58  0.33942593               utilities
## 98  0.33942593           utilities_sum
## 36  0.32320340                n_spaces
## 60  0.31900504              bookstores
## 100 0.31900504          bookstores_sum
## 64  0.29260281                bakeries
## 104 0.29260281            bakeries_sum
## 69  0.27747817           beauty_stores
## 109 0.27747817       beauty_stores_sum
## 59  0.27623975           subscriptions
## 99  0.27623975       subscriptions_sum
## 56  0.27562492        car_toll_parking
## 96  0.27562492    car_toll_parking_sum
## 20  0.26135226              n_space_dt
## 38  0.26135226            avg_space_dt
## 19  0.26129956              n_space_ct
## 37  0.26129956            avg_space_ct
## 8   0.24995169              n_pt_inter
## 23  0.24995169            pt_inter_sum
## 68  0.24657017                 airline
## 108 0.24657017             airline_sum
## 71  0.24291637           fines_tax_gov
## 111 0.24291637       fines_tax_gov_sum
## 53  0.24046586                taxicabs
## 93  0.24046586            taxicabs_sum
## 11  0.23996522          n_pt_intra_atm
## 26  0.23996522        pt_intra_atm_sum
## 18  0.23017197                    n_wu
## 33  0.23017197                  wu_sum
## 14  0.21547772                    n_dd
## 29  0.21547772                  dd_sum
## 35  0.21241239          avg_spaces_bal
## 73  0.21089290       professional_serv
## 113 0.21089290   professional_serv_sum
## 15  0.20182866                    n_ft
## 30  0.20182866                  ft_sum
## 75  0.19288738         travel_agencies
## 115 0.19288738     travel_agencies_sum
## 67  0.19219351           digital_goods
## 107 0.19219351       digital_goods_sum
## 70  0.16893565          transport_serv
## 110 0.16893565      transport_serv_sum
## 74  0.15672704         discount_stores
## 114 0.15672704     discount_stores_sum
## 12  0.15437314          n_pt_inter_atm
## 27  0.15437314        pt_inter_atm_sum
## 78  0.15350361             health_serv
## 118 0.15350361         health_serv_sum
## 4   0.14775941              months_sau
## 76  0.14376306      computer_data_serv
## 116 0.14376306  computer_data_serv_sum
## 72  0.14184833         money_financial
## 112 0.14184833     money_financial_sum
## 66  0.13272261           record_stores
## 106 0.13272261       record_stores_sum
## 65  0.09953976         gambling_gaming
## 105 0.09953976     gambling_gaming_sum
## 77  0.09736153              car_rental
## 117 0.09736153          car_rental_sum
## 80  0.08308009               education
## 120 0.08308009           education_sum
## 79  0.07366452        advertising_serv
## 119 0.07366452    advertising_serv_sum
## 16  0.05893513                n_cash26
## 31  0.05893513              cash26_sum
## 81  0.01901558             dating_serv
## 121 0.01901558         dating_serv_sum
# All defaults
ggplot(d, aes(x=reorder(feature, -percent), y=percent, fill=feature, label=scales::percent(percent)))+
  geom_bar(stat="identity", width=0.5) + ylab('Percent of Non-Zero Counts') + 
  xlab('Features') + 
  theme(axis.text.x = element_text(angle = 45, size=4, face='bold'), legend.position='none') +
  geom_text(position = position_dodge(width = .9),    # move to center of bars
            vjust = -0.5,    # nudge above top of bar
            size = 1) +
  scale_y_continuous(labels = scales::percent)

## Results

There are a few low frequency features that will not provide any meaningful variation to the analysis so we will drop them as they create NA’s in PCA.

# REMOVE low frequency features < 2%
rm.feat <- d[d[,1] <0.021,] 
df_fin <- df_fin[,!colnames(df_fin) %in% rm.feat[,2]]
#df_fin$months_sau <- NULL
dim(df_fin)
## [1] 113854    120
# remove SAU feature
# df_fin$months_sau <- NULL
# NOTE: tried both removing and including SAU feature and there were no significant differences in the final behavioral groupings but there were marginal improvements in classifying users so I decided to include this feature for the final results.
names(df_fin)
##   [1] "age"                     "weeks_wau_txn"          
##   [3] "mau_txn"                 "months_sau"             
##   [5] "mau_act"                 "n_pt_dom"               
##   [7] "n_pt_intra"              "n_pt_inter"             
##   [9] "n_pt_ecomm"              "n_pt_dom_atm"           
##  [11] "n_pt_intra_atm"          "n_pt_inter_atm"         
##  [13] "n_dt"                    "n_dd"                   
##  [15] "n_ft"                    "n_cash26"               
##  [17] "n_ct"                    "n_wu"                   
##  [19] "n_space_ct"              "n_space_dt"             
##  [21] "pt_dom_sum"              "pt_intra_sum"           
##  [23] "pt_inter_sum"            "pt_ecomm_sum"           
##  [25] "pt_dom_atm_sum"          "pt_intra_atm_sum"       
##  [27] "pt_inter_atm_sum"        "dt_sum"                 
##  [29] "dd_sum"                  "ft_sum"                 
##  [31] "cash26_sum"              "ct_sum"                 
##  [33] "wu_sum"                  "avg_primary_bal"        
##  [35] "avg_spaces_bal"          "n_spaces"               
##  [37] "avg_space_ct"            "avg_space_dt"           
##  [39] "n_ext_out"               "n_ext_in"               
##  [41] "ext_out_sum"             "ext_in_sum"             
##  [43] "grocery_market"          "restaurant"             
##  [45] "atm"                     "fast_food"              
##  [47] "local_transport"         "clothing"               
##  [49] "retail"                  "household"              
##  [51] "gas_service"             "food_drinks"            
##  [53] "taxicabs"                "drug_pharma"            
##  [55] "bars_clubs"              "car_toll_parking"       
##  [57] "entertainment"           "utilities"              
##  [59] "subscriptions"           "bookstores"             
##  [61] "business_org_serv"       "hotel_lodge"            
##  [63] "computer_electronic"     "bakeries"               
##  [65] "gambling_gaming"         "record_stores"          
##  [67] "digital_goods"           "airline"                
##  [69] "beauty_stores"           "transport_serv"         
##  [71] "fines_tax_gov"           "money_financial"        
##  [73] "professional_serv"       "discount_stores"        
##  [75] "travel_agencies"         "computer_data_serv"     
##  [77] "car_rental"              "health_serv"            
##  [79] "advertising_serv"        "education"              
##  [81] "no_cat"                  "grocery_market_sum"     
##  [83] "restaurant_sum"          "atm_sum"                
##  [85] "fast_food_sum"           "local_transport_sum"    
##  [87] "clothing_sum"            "retail_sum"             
##  [89] "household_sum"           "gas_service_sum"        
##  [91] "food_drinks_sum"         "taxicabs_sum"           
##  [93] "drug_pharma_sum"         "bars_clubs_sum"         
##  [95] "car_toll_parking_sum"    "entertainment_sum"      
##  [97] "utilities_sum"           "subscriptions_sum"      
##  [99] "bookstores_sum"          "business_org_serv_sum"  
## [101] "hotel_lodge_sum"         "computer_electronic_sum"
## [103] "bakeries_sum"            "gambling_gaming_sum"    
## [105] "record_stores_sum"       "digital_goods_sum"      
## [107] "airline_sum"             "beauty_stores_sum"      
## [109] "transport_serv_sum"      "fines_tax_gov_sum"      
## [111] "money_financial_sum"     "professional_serv_sum"  
## [113] "discount_stores_sum"     "travel_agencies_sum"    
## [115] "computer_data_serv_sum"  "car_rental_sum"         
## [117] "health_serv_sum"         "advertising_serv_sum"   
## [119] "education_sum"           "no_cat_sum"
# normalize COUNT DATA by ROW SUM --> to get relative importance of trxn count data for each user
df_cnt_row <- df_fin

# Make count data relative proportions by dividing by total transaction count for each user
names(df_cnt_row[,c(5:19)]) # txn count
##  [1] "mau_act"        "n_pt_dom"       "n_pt_intra"     "n_pt_inter"    
##  [5] "n_pt_ecomm"     "n_pt_dom_atm"   "n_pt_intra_atm" "n_pt_inter_atm"
##  [9] "n_dt"           "n_dd"           "n_ft"           "n_cash26"      
## [13] "n_ct"           "n_wu"           "n_space_ct"
names(df_cnt_row[,c(43:81)]) # mcc count
##  [1] "grocery_market"      "restaurant"          "atm"                
##  [4] "fast_food"           "local_transport"     "clothing"           
##  [7] "retail"              "household"           "gas_service"        
## [10] "food_drinks"         "taxicabs"            "drug_pharma"        
## [13] "bars_clubs"          "car_toll_parking"    "entertainment"      
## [16] "utilities"           "subscriptions"       "bookstores"         
## [19] "business_org_serv"   "hotel_lodge"         "computer_electronic"
## [22] "bakeries"            "gambling_gaming"     "record_stores"      
## [25] "digital_goods"       "airline"             "beauty_stores"      
## [28] "transport_serv"      "fines_tax_gov"       "money_financial"    
## [31] "professional_serv"   "discount_stores"     "travel_agencies"    
## [34] "computer_data_serv"  "car_rental"          "health_serv"        
## [37] "advertising_serv"    "education"           "no_cat"
df_cnt_row[,c(6:20)] <- df_cnt_row[,c(6:20)]/rowSums(df_cnt_row[,c(5:19)])
df_cnt_row[,c(43:81)] <- df_cnt_row[,c(43:81)]/rowSums(df_cnt_row[,c(43:81)])

df_cnt_row[is.na(df_cnt_row)] <- 0 

### Transforming skewed data

# log transform total volume for each transaction type
names(df_cnt_row[,c(21:35)])
##  [1] "pt_dom_sum"       "pt_intra_sum"     "pt_inter_sum"    
##  [4] "pt_ecomm_sum"     "pt_dom_atm_sum"   "pt_intra_atm_sum"
##  [7] "pt_inter_atm_sum" "dt_sum"           "dd_sum"          
## [10] "ft_sum"           "cash26_sum"       "ct_sum"          
## [13] "wu_sum"           "avg_primary_bal"  "avg_spaces_bal"
names(df_cnt_row[,c(82:120)])
##  [1] "grocery_market_sum"      "restaurant_sum"         
##  [3] "atm_sum"                 "fast_food_sum"          
##  [5] "local_transport_sum"     "clothing_sum"           
##  [7] "retail_sum"              "household_sum"          
##  [9] "gas_service_sum"         "food_drinks_sum"        
## [11] "taxicabs_sum"            "drug_pharma_sum"        
## [13] "bars_clubs_sum"          "car_toll_parking_sum"   
## [15] "entertainment_sum"       "utilities_sum"          
## [17] "subscriptions_sum"       "bookstores_sum"         
## [19] "business_org_serv_sum"   "hotel_lodge_sum"        
## [21] "computer_electronic_sum" "bakeries_sum"           
## [23] "gambling_gaming_sum"     "record_stores_sum"      
## [25] "digital_goods_sum"       "airline_sum"            
## [27] "beauty_stores_sum"       "transport_serv_sum"     
## [29] "fines_tax_gov_sum"       "money_financial_sum"    
## [31] "professional_serv_sum"   "discount_stores_sum"    
## [33] "travel_agencies_sum"     "computer_data_serv_sum" 
## [35] "car_rental_sum"          "health_serv_sum"        
## [37] "advertising_serv_sum"    "education_sum"          
## [39] "no_cat_sum"
df_cnt_row[df_cnt_row[,34] < 0,] <- 0 #replace negative balances with 0 before log transform

df_cnt_row[,c(21:35)] <- df_cnt_row[,c(21:35)] + 1 # add 1 to all data point bc there are 0's
df_cnt_row[,c(21:35)] <- log(df_cnt_row[,c(21:35)]) 

df_cnt_row[,c(82:120)] <- df_cnt_row[,c(82:120)] + 1 
df_cnt_row[,c(82:120)] <- log(df_cnt_row[,c(82:120)])
#### FEATURES to include in Analysis
#txn_mcc = df_cnt_row[,c(2:35,43:122)]
#mcc_sum = df_cnt_row[,c(81:119)]
txn = df_cnt_row[,c(2:35)]

set.seed(42) # This is to keep the sampling consistent for this example. Remove it to test different random samples, I ran this a few times with different random samples to see how consistent the results are with the samples size to ensure that I'm sampling enough to capture the variation in the complete data set.
users <-rownames(txn)
sample <- sample(users, size=20000)
df_sample = txn[rownames(txn) %in% sample,]
library("FactoMineR")
library("factoextra")
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
res.pca <- PCA(df_sample, graph=FALSE,ncp=50, scale.unit=T)
eig.val <- get_eigenvalue(res.pca)
head(eig.val,15)
##        eigenvalue variance.percent cumulative.variance.percent
## Dim.1   8.4359679        24.811670                    24.81167
## Dim.2   2.9387129         8.643273                    33.45494
## Dim.3   2.3562486         6.930143                    40.38509
## Dim.4   2.0738707         6.099620                    46.48471
## Dim.5   1.9946992         5.866762                    52.35147
## Dim.6   1.7373469         5.109844                    57.46131
## Dim.7   1.5484633         4.554304                    62.01562
## Dim.8   1.3849199         4.073294                    66.08891
## Dim.9   1.2495688         3.675202                    69.76411
## Dim.10  1.2367415         3.637475                    73.40159
## Dim.11  1.1380728         3.347273                    76.74886
## Dim.12  0.8872221         2.609477                    79.35834
## Dim.13  0.8111718         2.385799                    81.74414
## Dim.14  0.7697510         2.263974                    84.00811
## Dim.15  0.6447407         1.896296                    85.90441
# variance explained by each dimension/component
# ~40% of variance is explained by the first 3 components
fviz_eig(res.pca, addlabels = T, ylim=c(0,70))

var <- get_pca_var(res.pca)

# PCA correlation plot of features relative to the dimensions
library("corrplot")
## corrplot 0.84 loaded
corrplot(var$cos2, is.corr=FALSE,tl.cex=0.80)

library(scatterplot3d)
pc <- res.pca$ind$coord
scatterplot3d(pc[,1:3], pch=3, color="blue")

plot(eig.val[,3], xlab='Dimensions',ylab='Cumulative Variance',pch=19)

# head(eig.val,15)
# fviz_eig(res.pca, addlabels = T, ylim=c(0,70))
# corrplot(var$cos2, is.corr=FALSE,tl.cex=0.80)
# scatterplot3d(pc[,1:3], pch=3, color="blue")
# plot(eig.val[,3], xlab='Dimensions',ylab='Cumulative Variance',pch=19)

Analysis and Results

I used PCA to reduce the complexity of the data. By implementing PCA on the data before fitting the clustering model, we can reduce the complexity of the the clustering model and avoid overfitting (e.g. fitting noise) as well as improve the performance of the clustering model. I’ve fed in 35 features that result in 35 independent dimensions with each dimension capturing variation between correlated features. For instance, dim.3 captures that correlation between n_spaces_ct and n_space_dt. Note that each dim have orthogonal relationships thus completely independent of one another.

We can capture ~86% of the variance in the data by just using the first 15 PC’s. Based on the PCA correlation plots, PC 16-35 does not seem to capture much meaningful variation among the features so we can drop them.

# biplot with variables and individuals

# biplot for dim 1 and 2
fviz_pca_biplot(res.pca, repel = T,geom.ind = "point", axes=c(1,2),
                col.var = 'black', # Variables color
                col.ind = 'grey'  # Individuals color         
)

# biplot for dim 1 and 3
fviz_pca_biplot(res.pca, repel = T,geom.ind = "point", axes=c(1,3),
                col.var = 'black', # Variables color
                col.ind = 'grey'  # Individuals color         
)

# biplot for dim 2 and 3
fviz_pca_biplot(res.pca, repel = T,geom.ind = "point", axes=c(2,3),
                col.var = 'black', # Variables color
                col.ind = 'grey'  # Individuals color         
)

# biplot for dim 2 and 5
fviz_pca_biplot(res.pca, repel = T,geom.ind = "point", axes=c(2,5),
                col.var = 'black', # Variables color
                col.ind = 'grey'  # Individuals color         
)

## Results

These Biplots are 2 dimensional representations of the different principle components (dim) that describe the relationships between the features, which are represented by vectors. You can look at all different combinations of dims to investigate the relationships between the features. Its helpful to look at these plots in combination with the PCA correlation plot above. Grey dots represent the users. -High postively correlated features are clustered/grouped together and vectors are pointing in the same direction. -Negatively correlated features are positioned on opposite sides of the plot (arrows pointing in opposite directions) -Vectors that are far away from the origin (longest arrows) are well represented on the two dimensions (i.e dim1 and dim2 shows a strong relationship between mau_txn, pt_dom_atm_sum, ct_sum etc.)

# compute correlation matrix and rm NA's
#df.scaled = scale(df_fin, center=TRUE, scale=TRUE) # non-normalized for rowsums
df.scaled = scale(txn, center=TRUE, scale=TRUE)

df.scaled = scale(df_cnt_row, center=TRUE, scale=TRUE)
res.cor <- cor(df.scaled)

library('corrplot')
corrplot(res.cor, type="upper", order="hclust", 
         tl.col="black", 
         tl.srt=45,
         diag=FALSE,
         tl.cex=0.50,
         #addCoef.col="black"
         )

pc <- res.pca$ind$coord

# KMEANS CLUSTERING
library(factoextra)
comp <- pc[,1:15]

# K-means clustering
# Determine the correct number of clusters via weighted within sum of squares
gc() # Garbage collect
##            used  (Mb) gc trigger   (Mb) limit (Mb)  max used   (Mb)
## Ncells  1194182  63.8    2378506  127.1         NA   2378506  127.1
## Vcells 81066594 618.5  166666367 1271.6    1024000 161958642 1235.7
wss <- (nrow(comp)-1)*sum(apply(comp,2,var))
for (i in 2:50) wss[i] <- sum(kmeans(comp, centers=i, nstart=25, iter.max=1000,algorithm="MacQueen")$withinss)
plot(1:50, wss, type="b", xlab="Number of Clusters",
     ylab="Within groups sum of squares", main="K-means: Identify optimal number of clusters",xaxt="n")
axis(1, at=1:50, labels=c(1:50))

# Elbow method indicates k = 10 

Hierarchical-Kmeans hybrid clustering Analysis

Some of the requirements for Kmeans clustering is… i) you must specify the number of clusters beforehand ii) the initial centroids of the clusters are chosen randomly

Because the intial centroids are randomly chosen, the output of kmeans clusters can vary between runs. To get around this issue, we can implement a hybrid of hierarchical/kmean algorithm.

  1. The algorithm computes hierarchical clustering and cuts the tree into k-clusters (we can use the elbow method K=8-11 candidates). Hierarchical clustering is a method that builds a tree by grouping similar data points (step wise) into groups by using a pairwise similarity matrix between observations.
  2. then it identifies the centroid by computing the mean of of the clusters.
  3. finally, Kmeans clustering is then implemented using the cluster centers defined in 2) as initial cluster centers.

Analysis

  1. To estimate the optimal number of clusters, I used the elbow method and ran a k-means clustering algorithm for k= 1 to k= 50 clusters to the within groups sum of squares for k = i.

Results

Identifying the optimal number of clusters is a hard problem to solve. While there are a variety of methods that we can be used to identify the optimal number of clusters, for our first attempt at clustering, we can just use the simplest approach, the elbow method. This method looks at the variability of within groups (within groups sum of squares) as a function of the number of clusters. The goal is to strike a balance between minizing the variation within clusters and not dividing users into too many groups that don’t make much sense.

The elbow of the curve gives us an estimate of the optimal number of clusters, which is k = 8 - 11 clusters. As seen here, this method cannot always unambigiously identify the optimal number of clusters particularly for data with overlapping clusters. Based on the PCA results, we can see that the observations are densely packed and the clusters in this data set could potentially have a relatively high degree of overlap. As a result, we’ll use another method to identify test this range of k values to determine the best number of clusters.

# Note that the cluster numbers are assigned randomly and will be different each time you rerun the clustering analysis so you will have look at the data to determine the cluster behaviors and its corresponding cluster number. 

# k=7
res.hc <- eclust(comp, "hclust", k = 7,
                 method = "ward.D2", graph = FALSE) 
grp <- res.hc$cluster
# Compute cluster centers
clus.centers <- aggregate(comp, list(grp), mean)
# Remove the first column
clus.centers <- clus.centers[, -1]
# Kmeans clustering using hieracrchical clustering defined cluster-centers
km.res2_7 <- eclust(comp, "kmeans", k = clus.centers, graph = FALSE)
fviz_silhouette(km.res2_7)
##   cluster size ave.sil.width
## 1       1 4207          0.20
## 2       2 4403          0.09
## 3       3 2326          0.04
## 4       4 1573          0.09
## 5       5 1374          0.16
## 6       6 4318          0.19
## 7       7 1799          0.17

# k=8
res.hc <- eclust(comp, "hclust", k = 8,
                 method = "ward.D2", graph = FALSE) 
grp <- res.hc$cluster
# Compute cluster centers
clus.centers <- aggregate(comp, list(grp), mean)
# Remove the first column
clus.centers <- clus.centers[, -1]
# Kmeans clustering using hieracrchical clustering defined cluster-centers
km.res2_8 <- eclust(comp, "kmeans", k = clus.centers, graph = FALSE)
fviz_silhouette(km.res2_8)
##   cluster size ave.sil.width
## 1       1 4173          0.19
## 2       2 4223          0.10
## 3       3  364          0.14
## 4       4 1566          0.09
## 5       5 1359          0.16
## 6       6 4294          0.18
## 7       7 2218          0.08
## 8       8 1803          0.16

#k=9
res.hc <- eclust(comp, "hclust", k = 9,
                 method = "ward.D2", graph = FALSE) 
grp <- res.hc$cluster
# Compute cluster centers
clus.centers <- aggregate(comp, list(grp), mean)
# Remove the first column
clus.centers <- clus.centers[, -1]
# Kmeans clustering using hieracrchical clustering defined cluster-centers
km.res2_9 <- eclust(comp, "kmeans", k = clus.centers, graph = FALSE)
fviz_silhouette(km.res2_9)
##   cluster size ave.sil.width
## 1       1 2719          0.38
## 2       2 3938          0.09
## 3       3  356          0.14
## 4       4 1541          0.09
## 5       5 1341          0.16
## 6       6 4216          0.16
## 7       7 2163          0.08
## 8       8 1795          0.16
## 9       9 1931          0.04

#k=10
res.hc <- eclust(comp, "hclust", k = 10,
                 method = "ward.D2", graph = FALSE) 
grp <- res.hc$cluster
# Compute cluster centers
clus.centers <- aggregate(comp, list(grp), mean)
# Remove the first column
clus.centers <- clus.centers[, -1]
# Kmeans clustering using hieracrchical clustering defined cluster-centers
km.res2_10 <- eclust(comp, "kmeans", k = clus.centers, graph = FALSE)
fviz_silhouette(km.res2_10)
##    cluster size ave.sil.width
## 1        1 2704          0.37
## 2        2 3757          0.10
## 3        3  348          0.14
## 4        4 1534          0.09
## 5        5 1337          0.16
## 6        6 4198          0.16
## 7        7 2134          0.08
## 8        8 1788          0.16
## 9        9 1342          0.21
## 10      10  858          0.18

#k=11
res.hc <- eclust(comp, "hclust", k = 11,
                 method = "ward.D2", graph = FALSE) 
grp <- res.hc$cluster
# Compute cluster centers
clus.centers <- aggregate(comp, list(grp), mean)
# Remove the first column
clus.centers <- clus.centers[, -1]
# Kmeans clustering using hieracrchical clustering defined cluster-centers
km.res2_11 <- eclust(comp, "kmeans", k = clus.centers, graph = FALSE)
fviz_silhouette(km.res2_11)
##    cluster size ave.sil.width
## 1        1 2377          0.41
## 2        2 2645          0.14
## 3        3  330          0.14
## 4        4 1475          0.08
## 5        5 1342          0.15
## 6        6 4181          0.17
## 7        7 1641          0.16
## 8        8 2179          0.08
## 9        9 1764          0.15
## 10      10 1292          0.21
## 11      11  774          0.19

# fviz_silhouette(km.res2_7)
# fviz_silhouette(km.res2_8)
# fviz_silhouette(km.res2_9)
# fviz_silhouette(km.res2_10)
# fviz_silhouette(km.res2_11)

Silohuette plots

Silohuette plots is another method of evaluating the optimal K value and determining the quality of the clusters. Silohouette widths provides information on the goodness of clustering -high positive values correspond to users that match well to a cluster and poorly with the neighboring clusters -negative values correspond to users that do not have a high degree of membership confidence and are users that fall on the boardline of clusters potentially belonging to more than one cluster (reference Appendix for an example of this).

As I have mentioned above, the clusters are quite dense and not clearly separated so there are clusters that overlap (reference Appendix) resulting in users belonging to more than one cluster. In the future, it would be ideal to implement a probablistic model that can assign probabilities of cluster membership to each user. But for an exploratory analysis, I think using this model will suffice and provide some useful insights on the key behavioral groups that exist within this cohort.

# total within cluster sum of squares is an estimate of the % within variation / total variation within clusters-- we want to minimize this number
# here we look at the reduction of total within sum of squares to look at marginal improvements between k values
print('% within cluster variation')
## [1] "% within cluster variation"
km.res2_7$tot.withinss/km.res2_7$totss #k=7
## [1] 0.5681779
km.res2_8$tot.withinss/km.res2_8$totss #k=8
## [1] 0.5405174
km.res2_9$tot.withinss/km.res2_9$totss #k=9
## [1] 0.5168768
km.res2_10$tot.withinss/km.res2_10$totss #k=10
## [1] 0.491871
km.res2_11$tot.withinss/km.res2_11$totss #k=11
## [1] 0.4716877
print('an adhoc estimate of improvement based on the differences between % within cluster variation')
## [1] "an adhoc estimate of improvement based on the differences between % within cluster variation"
km.res2_7$tot.withinss/km.res2_7$totss - km.res2_8$tot.withinss/km.res2_8$totss
## [1] 0.0276605
km.res2_8$tot.withinss/km.res2_8$totss - km.res2_9$tot.withinss/km.res2_9$totss
## [1] 0.02364059
km.res2_9$tot.withinss/km.res2_9$totss - km.res2_10$tot.withinss/km.res2_10$totss
## [1] 0.0250058
km.res2_10$tot.withinss/km.res2_10$totss - km.res2_11$tot.withinss/km.res2_11$totss
## [1] 0.02018335
# Comparing the avg cluster widths of clusters -- note that all of the values are in sequential order k= 1 to k = i
print('Average Silohuette width for each clusters K=9 and 10')
## [1] "Average Silohuette width for each clusters K=9 and 10"
km.res2_7$silinfo$clus.avg.widths
## [1] 0.19729105 0.08702396 0.04438300 0.08914760 0.15816017 0.18879858
## [7] 0.16833767
km.res2_8$silinfo$clus.avg.widths
## [1] 0.19345185 0.10188029 0.13525616 0.09016755 0.15876354 0.17941630
## [7] 0.07730085 0.16442455
km.res2_9$silinfo$clus.avg.widths
## [1] 0.38184257 0.08940424 0.13610435 0.08522383 0.15778181 0.16455655
## [7] 0.07791880 0.15953835 0.03873085
km.res2_10$silinfo$clus.avg.widths
##  [1] 0.36859409 0.09661772 0.13536666 0.08529343 0.15739041 0.15655218
##  [7] 0.08398400 0.16000159 0.20733814 0.18292200
km.res2_11$silinfo$clus.avg.widths
##  [1] 0.41164417 0.13890345 0.13865029 0.08101639 0.15065209 0.16570058
##  [7] 0.15726672 0.08012286 0.15240249 0.20696029 0.19458587

Results

Identifying the optimal k value to settle for is a difficult problem and it ends up being a judgement call based on the patterns that emerge from the data and how the clusters split from one K value to the next. I spent a lot of time looking into the data to assess the patterns that emerged from the features across clusters and how the clusters split when the I added another k value to the model. I also looked at % within cluster variation and the corresponding changes between k-values as well as the overall avg silouette widths to judge whether adding another cluster to the model improved the clustering.

Because average silohuette width gives us an estimate of the performance of the clusters, I compared the avg silohuette widths across k=7 - k=11 model. I found that there were some improvements in the k=9,10,11 model relative to k=7 and k=8.

I looked at the features and how it relates to differentiating the clusters (reference the barplots below) for k = 9 and k=10. 8 clusters show consistent behavioral patterns between k=9 and k=10 (i.e. secondary spenders, barely active users, international travelers, holding account, spaces power users, primary account, unconvinced, euro/international travelers, cash26ers). K=10 extracted a 10th group that corresponds to low activity friend referal users… which I thought was an interesting group.

I also looked into k=11 and didn’t really see any meaningful groups arise from dividing the data into an additional cluster.

As a result, I settled for k=10.

# k = 9
# names(km.res2_9)
# hk.df <- data.frame(km.res2_9$cluster)
# colnames(hk.df) <- c("cluster")

names(km.res2_10)
##  [1] "cluster"      "centers"      "totss"        "withinss"    
##  [5] "tot.withinss" "betweenss"    "size"         "iter"        
##  [9] "ifault"       "silinfo"      "nbclust"      "data"
hk.df <- data.frame(km.res2_10$cluster)
colnames(hk.df) <- c("cluster")

########### barplots for mean values of clusters
merged.df <- transform(merge(df,hk.df,by=0), row.names=Row.names, Row.names=NULL)
merged.df[,141]<-as.factor(merged.df[,141])
merged.df$round <- merged.df$round/100
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
merged.df$sau_cat <- case_when(merged.df$months_sau > 0 ~ 'SAU',
                               merged.df$months_sau == 0 ~ 'NON-SAU')

head(merged.df)
##            id        user_created                kycc membership
## 1           1 2015-05-25 18:23:10 2018-09-03 15:29:47   standard
## 10001   10001 2018-08-28 18:29:53 2018-09-25 14:56:08   standard
## 10002   10002 2018-08-28 18:30:59 2018-09-03 10:19:14   standard
## 10003   10003 2018-08-28 18:43:06 2018-09-01 15:21:16   standard
## 100053 100053 2018-10-03 06:17:59 2018-10-03 06:48:06   standard
## 100066 100066 2018-10-03 06:36:58 2018-10-03 06:54:24   standard
##        product_id market nationality nat_status        referral  closed_at
## 1        STANDARD    DEU         DEU     native                           
## 10001    STANDARD    FRA         FRA     native     paid_search           
## 10002    STANDARD    FRA         FRA     native friend_referral           
## 10003    STANDARD    DEU         DEU     native                           
## 100053   STANDARD    FRA         FRA     native     paid_search           
## 100066   STANDARD    FRA         FRA     native       app_store 2020-01-14
##        kycc_closed_days gender age_group age weeks_wau_txn mau_txn
## 1                     0   MALE     25-29  30            40      12
## 10001                 0 FEMALE     25-29  30            41      11
## 10002                 0 FEMALE     25-29  30            14       5
## 10003                 0   MALE     50-54  51             1       1
## 100053                0 FEMALE     35-39  37            39       9
## 100066              468   MALE     20-24  24            31       9
##        months_sau mau_act n_pt_dom n_pt_intra n_pt_inter n_pt_ecomm
## 1               0      12        8        105          0         53
## 10001           0       5       35          7        168         30
## 10002           1       4        5         16         58          4
## 10003           0       1        0          0          0          0
## 100053          0       4      161          8         22         12
## 100066          8       9      175          1          0         40
##        n_pt_dom_atm n_pt_intra_atm n_pt_inter_atm n_dt n_dd n_ft n_cash26
## 1                 2              2              0    0    0    1        0
## 10001             0              0             18    2    0    0        0
## 10002             2              1              3    0    0   12        0
## 10003             0              0              0    0    0    0        0
## 100053            6              0              1    0    0    0        0
## 100066            7              0              0    9    0    0        0
##        n_ct n_wu n_space_ct n_space_dt pt_dom_sum pt_intra_sum
## 1        23    0          5          5     124.73      1143.68
## 10001     6    0          0          0     405.95        14.44
## 10002    20    1          0          0      33.72       419.50
## 10003     1    0          0          0       0.00         0.00
## 100053    6    0          0          0    3504.54       368.41
## 100066   13    1         61         61    2264.30         4.50
##        pt_inter_sum pt_ecomm_sum pt_dom_atm_sum pt_intra_atm_sum
## 1              0.00      1693.19             20            70.00
## 10001       2495.37      3329.95              0             0.00
## 10002       1638.73       143.07            100            54.98
## 10003          0.00         0.00              0             0.00
## 100053      1466.41       494.22            160             0.00
## 100066         0.00      1257.81            760             0.00
##        pt_inter_atm_sum  dt_sum dd_sum ft_sum cash26_sum  ct_sum wu_sum
## 1                  0.00    0.00      0   3.25          0 2815.76      0
## 10001            750.14 2854.26      0   0.00          0 9849.20      0
## 10002            268.15    0.00      0 309.01          0 3165.00     15
## 10003              0.00    0.00      0   0.00          0  100.00      0
## 100053           481.44    0.00      0   0.00          0 5900.00      0
## 100066             0.00 5430.00      0   0.00          0 9850.79     15
##        avg_primary_bal avg_spaces_bal n_spaces avg_space_ct avg_space_dt
## 1              -86.437          0.000        2     28.36879     28.36879
## 10001          632.558          0.000        1      0.00000      0.00000
## 10002          168.195          0.000        2      0.00000      0.00000
## 10003          100.000          0.000        0      0.00000      0.00000
## 100053         746.481          0.000        0      0.00000      0.00000
## 100066          70.920         23.496       11   3337.64706   3337.64706
##        n_ext_out n_ext_in ext_out_sum ext_in_sum grocery_market restaurant
## 1            171       24     3054.85    2817.26             14         20
## 10001        260       10     9850.11    9916.19             54         39
## 10002        101       21     2967.16    3180.00              5         11
## 10003          0        1        0.00     100.00              0          0
## 100053       210        8     6475.02    6597.52             16          8
## 100066       232       15     9716.61    9885.74             44          5
##        atm fast_food local_transport clothing retail household gas_service
## 1        4         9               4        0      4         2           1
## 10001   18        31               7        3      2         3           1
## 10002    6         7               6       25      5         1           4
## 10003    0         0               0        0      0         0           0
## 100053   7        28               5        2      0        11           1
## 100066   7        45              10        4     13         5           0
##        food_drinks taxicabs drug_pharma bars_clubs car_toll_parking
## 1                9       10           2         36                0
## 10001           11        8          10          4                0
## 10002            0        2           3          0                2
## 10003            0        0           0          0                0
## 100053          74        7           0          1                0
## 100066           2        0           2          3                0
##        entertainment utilities subscriptions bookstores business_org_serv
## 1                  0         0             3          0                 0
## 10001              1         4             0          0                 0
## 10002              0         0             0          0                 0
## 10003              0         0             0          0                 0
## 100053             7         1             0          3                 6
## 100066            13         0             1          1                23
##        hotel_lodge computer_electronic bakeries gambling_gaming
## 1                1                   1        0               0
## 10001           25                   0        6               0
## 10002            3                   0        0               0
## 10003            0                   0        0               0
## 100053           0                   1        6               0
## 100066           2                   6        9               0
##        record_stores digital_goods airline beauty_stores transport_serv
## 1                  0             4       1             0              4
## 10001              0             0       2             0              7
## 10002              0             0       0             2              0
## 10003              0             0       0             0              0
## 100053             0             0       0             8              0
## 100066             0             6       0             3              0
##        fines_tax_gov money_financial professional_serv discount_stores
## 1                  1              35                 4               0
## 10001              5               1                 0               3
## 10002              2               0                 1               1
## 10003              0               0                 0               0
## 100053             2               0                 2               0
## 100066             0               0                15               0
##        travel_agencies computer_data_serv car_rental health_serv
## 1                    1                  0          0           0
## 10001                2                  0          0           1
## 10002                0                  0          0           0
## 10003                0                  0          0           0
## 100053               1                  3          3           4
## 100066               0                  0          0           0
##        advertising_serv education dating_serv no_cat grocery_market_sum
## 1                     0         0           0      0             178.34
## 10001                 0         0           0     10             489.12
## 10002                 0         0           0      3              51.94
## 10003                 0         0           0      0               0.00
## 100053                0         0           0      3             321.17
## 100066                0         0           0      4             511.95
##        restaurant_sum atm_sum fast_food_sum local_transport_sum
## 1              298.70   90.00         60.20               45.85
## 10001          496.27  750.14        215.86               29.85
## 10002          274.32  423.13         73.64               55.89
## 10003            0.00    0.00          0.00                0.00
## 100053         131.35  641.44        393.21               18.11
## 100066          65.30  760.00        476.49              221.90
##        clothing_sum retail_sum household_sum gas_service_sum
## 1              0.00      18.75         21.50            3.80
## 10001         33.88     590.21         61.94            1.60
## 10002        750.47      91.56         24.05           64.62
## 10003          0.00       0.00          0.00            0.00
## 100053       171.09       0.00        949.60           10.71
## 100066       410.34     155.56        105.70            0.00
##        food_drinks_sum taxicabs_sum drug_pharma_sum bars_clubs_sum
## 1                35.50       110.60           12.10          394.1
## 10001            90.49        98.69           63.03           67.5
## 10002             0.00        31.84           43.87            0.0
## 10003             0.00         0.00            0.00            0.0
## 100053          122.94       228.46            0.00           10.0
## 100066           25.00         0.00           32.86           42.0
##        car_toll_parking_sum entertainment_sum utilities_sum
## 1                      0.00              0.00          0.00
## 10001                  0.00              6.38         60.04
## 10002                 22.61              0.00          0.00
## 10003                  0.00              0.00          0.00
## 100053                 0.00            499.98         14.21
## 100066                 0.00            176.37          0.00
##        subscriptions_sum bookstores_sum business_org_serv_sum
## 1                    2.8           0.00                  0.00
## 10001                0.0           0.00                  0.00
## 10002                0.0           0.00                  0.00
## 10003                0.0           0.00                  0.00
## 100053               0.0          18.86                 90.49
## 100066              49.0          17.00                360.48
##        hotel_lodge_sum computer_electronic_sum bakeries_sum
## 1                11.80                    9.99         0.00
## 10001          1922.14                    0.00        65.41
## 10002           522.74                    0.00         0.00
## 10003             0.00                    0.00         0.00
## 100053            0.00                   75.00        35.40
## 100066           13.40                  133.07        67.08
##        gambling_gaming_sum record_stores_sum digital_goods_sum airline_sum
## 1                        0                 0             38.95        30.0
## 10001                    0                 0              0.00       350.7
## 10002                    0                 0              0.00         0.0
## 10003                    0                 0              0.00         0.0
## 100053                   0                 0              0.00         0.0
## 100066                   0                 0            101.93         0.0
##        beauty_stores_sum transport_serv_sum fines_tax_gov_sum
## 1                    0.0              32.10             15.00
## 10001                0.0              14.44             47.31
## 10002               65.2               0.00              3.80
## 10003                0.0               0.00              0.00
## 100053            1067.2               0.00             55.83
## 100066              57.0               0.00              0.00
##        money_financial_sum professional_serv_sum discount_stores_sum
## 1                  1465.30                126.22                0.00
## 10001               628.41                  0.00               60.00
## 10002                 0.00                  8.77                5.97
## 10003                 0.00                  0.00                0.00
## 100053                0.00                 14.05                0.00
## 100066                0.00                188.74                0.00
##        travel_agencies_sum computer_data_serv_sum car_rental_sum
## 1                    50.00                      0           0.00
## 10001               725.83                      0           0.00
## 10002                 0.00                      0           0.00
## 10003                 0.00                      0           0.00
## 100053              135.39                     50         213.82
## 100066                0.00                      0           0.00
##        health_serv_sum advertising_serv_sum education_sum dating_serv_sum
## 1                  0.0                    0             0               0
## 10001             51.5                    0             0               0
## 10002              0.0                    0             0               0
## 10003              0.0                    0             0               0
## 100053           397.0                    0             0               0
## 100066             0.0                    0             0               0
##        no_cat_sum  cohort continent cntry_nationality    round pnl_cat
## 1            0.00 2018-09    EUROPE           GERMANY  16.9401 pos_pnl
## 10001       75.11 2018-09    EUROPE            FRANCE  67.8300 pos_pnl
## 10002      143.73 2018-09    EUROPE            FRANCE  24.3000 pos_pnl
## 10003        0.00 2018-09    EUROPE           GERMANY  -1.0000 neg_pnl
## 100053     809.71 2018-10    EUROPE            FRANCE   4.9600 pos_pnl
## 100066     315.44 2018-10    EUROPE            FRANCE 203.5689 pos_pnl
##        cluster sau_cat
## 1            1 NON-SAU
## 10001        8 NON-SAU
## 10002        8     SAU
## 10003        1 NON-SAU
## 100053       8 NON-SAU
## 100066       5     SAU
demo.df <- merged.df[,c(4:9,12:13,136:142)]
library('reshape2')
library('reshape')
## 
## Attaching package: 'reshape'
## The following objects are masked from 'package:reshape2':
## 
##     colsplit, melt, recast
## The following object is masked from 'package:dplyr':
## 
##     rename
library('tidyr')
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:reshape':
## 
##     expand, smiths
## The following object is masked from 'package:reshape2':
## 
##     smiths
library('dplyr')
library(ggplot2)

library(ggplot2)

cluster.freq<- as.data.frame(table(merged.df[,141]))
cluster.freq['percent'] <- cluster.freq[,2]/200
cluster.freq
##    Var1 Freq percent
## 1     1 2704  13.520
## 2     2 3757  18.785
## 3     3  348   1.740
## 4     4 1534   7.670
## 5     5 1337   6.685
## 6     6 4198  20.990
## 7     7 2134  10.670
## 8     8 1788   8.940
## 9     9 1342   6.710
## 10   10  858   4.290
p <- ggplot(cluster.freq, aes(y=percent,x=as.factor(Var1),fill=as.factor(Var1))) +
  geom_bar(position="stack", stat="identity") + xlab("Clusters") + ylab("Percent") +
  theme(
    #legend.title = element_text(color = "black", size = 16),
    #legend.text = element_text(color = "black", size = 10,face="bold"),
    #legend.position='top',
    axis.text=element_text(size=16,face="bold")
  ) + geom_text(aes(label = round(percent,2), y = percent + 0.40), position = position_dodge(0.9),format_string='{:.1f}% ') 
## Warning: Ignoring unknown parameters: format_string
p + scale_fill_manual(values=c("#48AC98", "#E5C3C7","#CB7C7A", "#CAD7CA","#CDA35F","#C8D7E5","#266678","#F5D5B9","#737373","#CCCCCC",
                               "#D3D3D3",
                               "#999999",
                               "#8DA290","#CAA7BD",
                               "#B4BAD4",
                               '#B88BAD',
                               "#DBC4DF",
                               "#D2C1CE",
                               "#FDF7C2"))

# Evaluating the mean/median value of the features across clusters 
mydata = melt(merged.df[,c(14:25,141)],id=c('cluster'))

mean_df10 = as.data.frame(mydata %>% 
                            group_by(cluster,variable) %>%
                            summarise_all("mean"))

med_df10 = as.data.frame(mydata %>% 
                            group_by(cluster,variable) %>%
                            summarise_all("median"))
#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))

ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=as.factor(cluster))) + 
  geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
  facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")

ggplot(mean_df10[mean_df10[,2] %in% c('weeks_wau_txn','mau_txn','mau_act','months_sau'),], aes(x=as.factor(cluster), y=value,fill=as.factor(cluster))) + 
  geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25) + theme(legend.position = "none",axis.text=element_text(size=16,face="bold")) +
  facet_grid(variable ~ ., scales = "free_y") + scale_fill_manual(values=c("#48AC98", "#E5C3C7","#CB7C7A", "#CAD7CA","#CDA35F","#C8D7E5","#266678","#F5D5B9","#737373","#CCCCCC",
                               "#D3D3D3",
                               "#999999",
                               "#8DA290","#CAA7BD",
                               "#B4BAD4",
                               '#B88BAD',
                               "#DBC4DF",
                               "#D2C1CE",
                               "#FDF7C2"))

mydata = melt(merged.df[,c(21:33,139,141)],id=c('cluster'))

mean_df10 = as.data.frame(mydata %>% 
                            group_by(cluster,variable) %>%
                            summarise_all("mean"))

#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))

med_df10 = as.data.frame(mydata %>% 
                            group_by(cluster,variable) %>%
                            summarise_all("median"))

ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=variable)) + 
  geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
  facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")

mydata = melt(merged.df[,c(34:48,141)],id=c('cluster'))

mean_df10 = as.data.frame(mydata %>% 
                            group_by(cluster,variable) %>%
                            summarise_all("mean"))

#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))

ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=variable)) + 
  geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
  facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")

mydata = melt(merged.df[,c(56:68,141)],id=c('cluster'))

mean_df10 = as.data.frame(mydata %>% 
                            group_by(cluster,variable) %>%
                            summarise_all("mean"))

#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))

ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=variable)) + 
  geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
  facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")

mydata = melt(merged.df[,c(69:82,141)],id=c('cluster'))

mean_df10 = as.data.frame(mydata %>% 
                            group_by(cluster,variable) %>%
                            summarise_all("mean"))

#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))

ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=variable)) + 
  geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
  facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")

mean_df10 = as.data.frame(mydata %>% 
                            group_by(cluster,variable) %>%
                            summarise_all("mean"))

#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))

ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=variable)) + 
  geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
  facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")

mydata = melt(merged.df[,c(96:108,141)],id=c('cluster'))
mean_df10 = as.data.frame(mydata %>% 
                            group_by(cluster,variable) %>%
                            summarise_all("mean"))

#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))

ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=variable)) + 
  geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
  facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")

mydata = melt(merged.df[,c(109:124,141)],id=c('cluster'))
mean_df10 = as.data.frame(mydata %>% 
                            group_by(cluster,variable) %>%
                            summarise_all("mean"))

#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))

ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=variable)) + 
  geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
  facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")

mydata = melt(merged.df[,c(125:134,141)],id=c('cluster'))
mean_df10 = as.data.frame(mydata %>% 
                            group_by(cluster,variable) %>%
                            summarise_all("mean"))

#mean_df10[,1] <- factor(mean_df10[,1],levels=c(7,1,3,9,5,6,4,8,2,10))

ggplot(mean_df10, aes(x=as.factor(cluster), y=value,fill=variable)) + 
  geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25)+
  facet_grid(variable ~ ., scales = "free_y") + theme(legend.position = "none")

sau <- melt(table(merged.df[,c(141,142)]))
sau <- cast(melt(table(merged.df[,c(142,141)])),cluster~sau_cat)
sau['percent'] <- round(sau[,3]/(sau[,2]+sau[,3])*100,1)
#sau$cluster <- factor(sau[,1],levels=c(7,1,3,9,5,6,4,8,2,10)) # reorder factor levels 

ggplot(sau, aes(y=percent,x=as.factor(cluster),fill=as.factor(cluster))) + theme(legend.position = "none",axis.text=element_text(size=16,face="bold"))+
  geom_bar(position="stack", stat="identity") + xlab("Clusters- SAU's") + ylab("Percent SAU's") +
  theme(
    #legend.title = element_text(color = "black", size = 16),
    #legend.text = element_text(color = "black", size = 10,face="bold"),
    axis.text=element_text(size=16,face="bold") + scale_fill_manual(values=c("#48AC98", "#E5C3C7", "#CAD7CA","#266678","#C8D7E5","#CDA35F","#F5D5B9"))
  ) + scale_fill_manual(values=c("#48AC98", "#E5C3C7","#CB7C7A", "#CAD7CA","#CDA35F","#C8D7E5","#266678","#F5D5B9","#737373","#CCCCCC",
                               "#D3D3D3",
                               "#999999",
                               "#8DA290","#CAA7BD",
                               "#B4BAD4",
                               '#B88BAD',
                               "#DBC4DF",
                               "#D2C1CE",
                               "#FDF7C2")) + geom_text(aes(label = percent, y = percent + 0.40), position = position_dodge(0.9))

# EXAMPLE OF INTERNATIONAL AND SECONDARY SPENDER OVERLAPPING CLUSTERS 
# Note that the cluster numbers are assigned randomly and will be different each time you rerun the clustering analysis so you will have look at the data to determine the cluster behaviors. 
# in this clustering example, cluster 8 = international travelers and cluster 6 = secondary spenders

# Information on cluster neighbors and silohuette widths for each user
silo8 <- km.res2_8$silinfo$widths
silo9 <- km.res2_9$silinfo$widths
silo10 <- km.res2_10$silinfo$widths
silo11 <- km.res2_11$silinfo$widths

# users with negative silohuette values are located on the boarder of neighboring cluster
silo10.neg <-silo10[silo10[,3] < 0,] 
head(silo10.neg) # users with negative cluster widths and corresponding neighboring clusters
##        cluster neighbor     sil_width
## 142442       2        1 -0.0004527271
## 7080         2        6 -0.0005589185
## 126911       2        6 -0.0008426847
## 140313       2        1 -0.0008540853
## 18539        2        9 -0.0008883909
## 86200        2        6 -0.0008939617
# example of negative width values for international travelers cluster
# ~65% of these users are neighbors with users from Secondary spender cluster 
table(silo10.neg[silo10.neg[,1] == 8,2]) #change cluster number accordingly
## 
##   1   2   4   6   9 
##  37  49   2 174   4
length(silo10.neg[silo10.neg[,1] == 8,2])
## [1] 266
# in this clustering example, cluster 8 = international travelers and cluster 6 = secondary spenders
idx <- rownames(silo10.neg[(silo10.neg[,1] == 8) & (silo10.neg[,2] == 6),])
m.pc <- transform(merge(pc,hk.df,by=0), row.names=Row.names, Row.names=NULL)
pc.overlap <- m.pc[(m.pc[,35]==8) | (m.pc[,35]==6),]
pc.overlap$colors <- case_when(rownames(pc.overlap) %in% idx ~ "#E69F00", #overlap (yellow)
                             (!rownames(pc.overlap) %in% idx) & (pc.overlap$cluster == 8) ~ "#999999", #international travelers (grey)
                              pc.overlap$cluster == 6 ~ "#56B4E9" #secondary spenders (blue)
                             )

pc.overlap$cluster1 <- case_when(rownames(pc.overlap) %in% idx ~ "overlap", #overlap (yellow)
                             (!rownames(pc.overlap) %in% idx) & (pc.overlap$cluster == 8) ~ "inter_traveler", #international travelers (grey)
                              pc.overlap$cluster == 6 ~ "secondary_spender" #secondary spenders (blue)
                             )

# SCATTER PLOT OF THE CLUSTER OVERLAP
library(scatterplot3d)
scatterplot3d(pc.overlap[,c(1,3,2)], pch=6, color=pc.overlap$colors)
legend("topleft", legend=c("Inter-Travelers", "Secondary-Spenders","Overlapping users"),
       col=c("#999999", "#56B4E9","#E69F00"), cex=1,pch=6)

scatterplot3d(pc.overlap[,c(2,3,1)], pch=6, color=pc.overlap$colors)
legend("topleft", legend=c("Inter-Travelers", "Secondary-Spenders","Overlapping users"),
       col=c("#999999", "#56B4E9","#E69F00"), cex=1,pch=6)

scatterplot3d(pc.overlap[,c(3,2,1)], pch=6, color=pc.overlap$colors)
legend("topleft", legend=c("Inter-Travelers", "Secondary-Spenders","Overlapping users"),
       col=c("#999999", "#56B4E9","#E69F00"), cex=1,pch=6)

scatterplot3d(pc.overlap[,c(1,2,3)], pch=6, color=pc.overlap$colors)
legend("topleft", legend=c("Inter-Travelers", "Secondary-Spenders","Overlapping users"),
       col=c("#999999", "#56B4E9","#E69F00"), cex=1,pch=6)

plot(pc.overlap[,1],pc.overlap[,2],col=pc.overlap$colors,pch=20,xlab='dim1',ylab='dim2')
legend("topleft", legend=c("Inter-Travelers", "Secondary-Spenders","Overlapping users"),
       col=c("#999999", "#56B4E9","#E69F00"), cex=1,pch=20)

# Lets take a look at the differences between these 3 groups 
feature.df <- transform(merge(df,pc.overlap,by=0), row.names=Row.names, Row.names=NULL)

features <- feature.df[,colnames(feature.df) %in% c("weeks_wau_txn","pt_dom_sum","pt_inter_sum","cluster1","colors")]
features <- melt(features,id=c('cluster1',"colors"))

mean_df10 = as.data.frame(features %>% 
                            group_by(cluster1,colors,variable) %>%
                            summarise_all("mean"))

ggplot(mean_df10, aes(x=as.factor(cluster1), y=value,fill=as.factor(cluster1))) + 
  geom_bar(stat="identity") + geom_text(aes(label=round(value)), position=position_dodge(width=0.9), vjust=-0.25) + theme(legend.position = "none",axis.text=element_text(size=16,face="bold")) +
  facet_grid(variable ~ ., scales = "free_y") + scale_fill_manual(values=c("inter_traveler"="#999999","overlap" = "#E69F00", "secondary_spender"="#56B4E9")) 

## Results Note that the cluster numbers are assigned randomly and will be different each time you rerun the clustering analysis so you will have look at the data to determine the cluster behaviors.

The yellow dots are the users with negative width values from the cluster corresponding to international travelers (grey). These users are located next to the secondary spender cluster (blue). This suggests that these users may exhibit behavior that are intermediate between the two groups. The overlapping customers use our product for both international travel and domestic spending and seem to be more active than users that are using the app primarily for international spend.